!pip install spotipy
Collecting spotipy
  Downloading spotipy-2.25.1-py3-none-any.whl.metadata (5.1 kB)
Collecting redis>=3.5.3 (from spotipy)
  Downloading redis-5.2.1-py3-none-any.whl.metadata (9.1 kB)
Requirement already satisfied: requests>=2.25.0 in /usr/local/lib/python3.11/dist-packages (from spotipy) (2.32.3)
Requirement already satisfied: urllib3>=1.26.0 in /usr/local/lib/python3.11/dist-packages (from spotipy) (2.3.0)
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests>=2.25.0->spotipy) (3.4.1)
Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests>=2.25.0->spotipy) (3.10)
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests>=2.25.0->spotipy) (2025.1.31)
Downloading spotipy-2.25.1-py3-none-any.whl (31 kB)
Downloading redis-5.2.1-py3-none-any.whl (261 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 261.5/261.5 kB 11.6 MB/s eta 0:00:00
# Import necessary libraries
from scipy.spatial.distance import cdist
import difflib
from collections import defaultdict
import os
import numpy as np
import pandas as pd
from yellowbrick.target import FeatureCorrelation
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt

from spotipy.oauth2 import SpotifyClientCredentials
import spotipy
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from scipy.spatial.distance import cdist


import warnings

# Ignore warnings
warnings.filterwarnings("ignore")
# Load the track-level, per-genre, and per-year datasets.
# NOTE(review): /content paths suggest a Google Colab runtime with the
# CSVs uploaded manually — confirm before running elsewhere.
data = pd.read_csv("/content/data.csv")
genre_data = pd.read_csv("/content/data_by_genres.csv")
year_data = pd.read_csv("/content/data_by_year.csv")

# Numeric/audio columns used as model features throughout the notebook.
feature_names = [
    "acousticness", "danceability", "energy", "instrumentalness",
    "liveness", "loudness", "speechiness", "tempo", "valence",
    "duration_ms", "explicit", "key", "mode", "year",
]

# Feature matrix (X) and the target we correlate against (popularity).
X, y = data[feature_names], data["popularity"]

# yellowbrick wants an array of labels; convert the list once.
features = np.array(feature_names)
# Visualizer that plots each feature's Pearson correlation with y
# (the rendered chart's x-axis is labelled "Pearson Correlation").
visualizer = FeatureCorrelation(labels=features)

# Enlarge the figure so all 14 feature labels stay readable.
plt.rcParams["figure.figsize"] = (20, 20)

# fit() computes the correlations; show() renders the bar chart.
visualizer.fit(X, y)
visualizer.show()

<Axes: title={'center': 'Features correlation with dependent variable'}, xlabel='Pearson Correlation'>
def get_decade(year):
    """Return the decade label for a year, e.g. 1985 -> "1980s".

    Uses floor division (``//``), which is exact for integers and avoids
    the float round-trip of the original ``int(year / 10) * 10``.
    """
    period_start = (int(year) // 10) * 10
    return f"{period_start}s"

# Bucket every track into its decade for the distribution plot below.
data["decade"] = data["year"].apply(get_decade)

# Plot how many tracks fall into each decade.
sns.set(rc={"figure.figsize": (11, 6)})
# NOTE(review): the Series is passed positionally; recent seaborn
# versions require an explicit x=/y= keyword argument here — confirm
# the pinned seaborn version before rerunning.
sns.countplot(data["decade"])
<Axes: xlabel='count', ylabel='decade'>

# Audio attributes whose year-over-year trends we want to visualise.
sound_features = [
    "acousticness",
    "danceability",
    "energy",
    "instrumentalness",
    "liveness",
    "valence",
]

# One line trace per sound feature, indexed by release year.
fig = px.line(year_data, x="year", y=sound_features)
fig.show()
# Compare the audio profile of the ten most popular genres.
profile_cols = ["valence", "energy", "danceability", "acousticness"]
top10_genres = genre_data.nlargest(10, "popularity")

# Grouped bars: the four profile features side by side per genre.
fig = px.bar(top10_genres, x="genres", y=profile_cols, barmode="group")
fig.show()
# Clustering pipeline for genres: standardise the features, then K-Means.
# random_state pins the (otherwise arbitrary) cluster assignment so the
# notebook is reproducible run-to-run; n_init is set explicitly to avoid
# scikit-learn's warning about its changing default.
cluster_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("kmeans", KMeans(n_clusters=10, n_init=10, random_state=42)),
])

# Only numeric columns can be scaled and clustered.
X = genre_data.select_dtypes(np.number)

# Fit the pipeline and attach a cluster label to every genre row.
cluster_pipeline.fit(X)
genre_data["cluster"] = cluster_pipeline.predict(X)
# Project the scaled genre features onto 2-D with t-SNE for plotting.
tsne_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("tsne", TSNE(n_components=2, verbose=1)),
])
genre_embedding = tsne_pipeline.fit_transform(X)

# Assemble a plotting frame: the two embedding axes plus genre metadata.
projection = pd.DataFrame(genre_embedding, columns=["x", "y"])
projection["genres"] = genre_data["genres"]
projection["cluster"] = genre_data["cluster"]

# Scatter the genres, coloured by their K-Means cluster.
fig = px.scatter(
    projection,
    x="x",
    y="y",
    color="cluster",
    hover_data=["x", "y", "genres"],
)
fig.show()
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 2973 samples in 0.005s...
[t-SNE] Computed neighbors for 2973 samples in 0.395s...
[t-SNE] Computed conditional probabilities for sample 1000 / 2973
[t-SNE] Computed conditional probabilities for sample 2000 / 2973
[t-SNE] Computed conditional probabilities for sample 2973 / 2973
[t-SNE] Mean sigma: 0.777516
[t-SNE] KL divergence after 250 iterations with early exaggeration: 76.102470
[t-SNE] KL divergence after 1000 iterations: 1.393200
# Clustering pipeline for individual songs: scale, then 20-way K-Means.
# random_state makes the cluster labels reproducible (matching the genre
# pipeline); verbose=False silences per-iteration logging.
song_cluster_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("kmeans", KMeans(n_clusters=20, verbose=False, n_init=10, random_state=42)),
])

# number_cols is reused later by the recommender to slice feature columns.
X = data.select_dtypes(np.number)
number_cols = list(X.columns)

# Fit the pipeline and assign each song its cluster label.
song_cluster_pipeline.fit(X)
song_cluster_labels = song_cluster_pipeline.predict(X)
data["cluster_label"] = song_cluster_labels
# Reduce the song features to two principal components for plotting.
pca_pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("PCA", PCA(n_components=2)),
])
song_embedding = pca_pipeline.fit_transform(X)

# Plotting frame: PCA axes, song title, and the K-Means cluster label.
projection = pd.DataFrame(song_embedding, columns=["x", "y"])
projection["title"] = data["name"]
projection["cluster"] = data["cluster_label"]

# Scatter the songs, coloured by cluster, with titles on hover.
fig = px.scatter(
    projection,
    x="x",
    y="y",
    color="cluster",
    hover_data=["title"],
)
fig.show()
# Authenticate with the Spotify Web API using the client-credentials
# flow (app-level access; no user authorization involved).
# Raises KeyError immediately if either environment variable is unset.
sp = spotipy.Spotify(
    auth_manager=SpotifyClientCredentials(
        client_id=os.environ["SPOTIFY_CLIENT_ID"],
        client_secret=os.environ["SPOTIFY_CLIENT_SECRET"],
    )
)
def find_song(name, year):
    """Look up a track on Spotify and return its metadata and audio features.

    Parameters
    ----------
    name : str
        Track title to search for.
    year : int
        Release year used to narrow the search.

    Returns
    -------
    pandas.DataFrame or None
        A one-row DataFrame with track metadata and audio features, or
        None when the track (or its audio-feature record) is unavailable.
    """
    results = sp.search(q="track: {} year: {}".format(name, year), limit=1)

    # No match on Spotify.
    if not results["tracks"]["items"]:
        return None

    track = results["tracks"]["items"][0]
    feature_records = sp.audio_features(track["id"])

    # audio_features() can return [None] for tracks without analysis data;
    # treat that the same as "not found" instead of crashing on item access.
    if not feature_records or feature_records[0] is None:
        return None
    audio_features = feature_records[0]

    # Values are wrapped in one-element lists so the dict converts to a
    # single-row DataFrame. A plain dict suffices — the original used a
    # defaultdict() with no default factory, which adds nothing.
    song_data = {
        "name": [name],
        "year": [year],
        "explicit": [int(track["explicit"])],
        "duration_ms": [track["duration_ms"]],
        "popularity": [track["popularity"]],
    }
    for key, value in audio_features.items():
        song_data[key] = [value]

    return pd.DataFrame(song_data)
def get_song_data(song, spotify_data):
    """Return the dataset row matching a song's name and year.

    Falls back to querying the Spotify API (via ``find_song``) when the
    song is not present in the local dataset.
    """
    matches = spotify_data[
        (spotify_data["name"] == song["name"])
        & (spotify_data["year"] == song["year"])
    ]
    if matches.empty:
        return find_song(song["name"], song["year"])
    return matches.iloc[0]
def get_mean_vector(song_list, spotify_data):
    """Average the numeric feature vectors of the given songs.

    Songs that cannot be resolved (neither in the local dataset nor via
    the Spotify API) are skipped with a printed warning.

    Raises
    ------
    ValueError
        If none of the songs could be resolved. Previously this case fed
        an empty array to np.mean, silently producing NaNs (warnings are
        suppressed file-wide) that crashed downstream scaling.
    """
    song_vectors = []

    for song in song_list:
        song_data = get_song_data(song, spotify_data)

        if song_data is None:
            print(f"Warning: {song['name']} does not exist in Spotify or in database")
            continue

        song_vectors.append(song_data[number_cols].values)

    if not song_vectors:
        raise ValueError("None of the input songs could be found")

    return np.mean(np.array(song_vectors), axis=0)
def flatten_dict_list(dict_list):
    """Merge a list of dicts into one mapping from each key to the list
    of values that key took across the input dicts.
    """
    merged = defaultdict(list)

    for entry in dict_list:
        for field, value in entry.items():
            merged[field].append(value)

    return merged
def recommend_songs(song_list, spotify_data, n_songs=10):
    """Recommend the n_songs tracks nearest (cosine distance) to the
    mean feature vector of song_list.

    NOTE(review): this definition is immediately shadowed by the
    metric-parameterised redefinition below and is therefore dead code.
    """
    metadata_cols = ["name", "year", "artists"]
    song_dict = flatten_dict_list(song_list)
    song_center = get_mean_vector(song_list, spotify_data)

    # Reuse the scaler already fitted on the full song dataset so the
    # query point and the candidates live in the same standardised space.
    scaler = song_cluster_pipeline["scaler"]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))

    # Cosine distance from the query centroid to every song; take the
    # indices of the n_songs closest.
    distances = cdist(scaled_song_center, scaled_data, "cosine")
    index = list(np.argsort(distances)[:, :n_songs][0])

    rec_songs = spotify_data.iloc[index]
    # Drop the seed songs themselves; note this can shrink the result
    # below n_songs because filtering happens after the top-n slice.
    rec_songs = rec_songs[~rec_songs["name"].isin(song_dict["name"])]

    return rec_songs[metadata_cols].to_dict(orient="records")
def recommend_songs(song_list, spotify_data, n_songs=10, metric="cosine"):
    """Recommend songs similar to *song_list* from *spotify_data*.

    Parameters
    ----------
    song_list : list of dict
        Seed songs; each dict needs "name" and "year" keys.
    spotify_data : pandas.DataFrame
        Dataset containing the columns in ``number_cols`` plus metadata.
    n_songs : int
        Number of recommendations to return.
    metric : str
        Any distance metric accepted by scipy.spatial.distance.cdist.

    Returns
    -------
    list of dict
        Records with "name", "year" and "artists" of the closest songs.
    """
    metadata_cols = ["name", "year", "artists"]
    song_dict = flatten_dict_list(song_list)
    song_center = get_mean_vector(song_list, spotify_data)

    # Reuse the scaler fitted on the full dataset so the query centroid
    # and the candidates are standardised identically.
    scaler = song_cluster_pipeline["scaler"]
    scaled_data = scaler.transform(spotify_data[number_cols])
    scaled_song_center = scaler.transform(song_center.reshape(1, -1))

    # Distance from the centroid to every song under the chosen metric.
    distances = cdist(scaled_song_center, scaled_data, metric)

    # Over-fetch by the number of seed songs: the seeds themselves are
    # usually among the nearest neighbours and get filtered out below,
    # which previously made the function return fewer than n_songs.
    n_candidates = n_songs + len(song_list)
    index = list(np.argsort(distances)[:, :n_candidates][0])

    rec_songs = spotify_data.iloc[index]
    rec_songs = rec_songs[~rec_songs["name"].isin(song_dict["name"])].head(n_songs)

    print(f"Top {n_songs} recommendations using {metric} distance:\n")
    print(rec_songs[metadata_cols])
    return rec_songs[metadata_cols].to_dict(orient="records")

# Demonstrate the recommender with three different distance metrics.
sample_songs = [{"name": "Blinding Lights", "year": 2020}]

for header, metric_name in [
    ("Using Cosine Similarity:", "cosine"),
    ("\nUsing Euclidean Distance:", "euclidean"),
    ("\nUsing Manhattan Distance:", "cityblock"),
]:
    print(header)
    recommend_songs(sample_songs, data, metric=metric_name)
Using Cosine Similarity:
Top 10 recommendations using cosine distance:

                            name  year              artists
17428                    Secrets  2009      ['OneRepublic']
18039   They Don't Know About Us  2012    ['One Direction']
19037                    Thunder  2017  ['Imagine Dragons']
19627          Forever After All  2020       ['Luke Combs']
17410                  Fireflies  2009         ['Owl City']
37960                Getaway Car  2017     ['Taylor Swift']
57363            Black And White  2020      ['Niall Horan']
108337                Magic Shop  2018              ['BTS']
18970       Versace on the Floor  2016       ['Bruno Mars']

Using Euclidean Distance:
Top 10 recommendations using euclidean distance:

                           name  year                artists
19037                   Thunder  2017    ['Imagine Dragons']
19627         Forever After All  2020         ['Luke Combs']
18039  They Don't Know About Us  2012      ['One Direction']
17410                 Fireflies  2009           ['Owl City']
37960               Getaway Car  2017       ['Taylor Swift']
17428                   Secrets  2009        ['OneRepublic']
57363           Black And White  2020        ['Niall Horan']
19401                  Drunk Me  2018  ['Mitchell Tenpenny']
18970      Versace on the Floor  2016         ['Bruno Mars']

Using Manhattan Distance:
Top 10 recommendations using cityblock distance:

                            name  year                                 artists
18039   They Don't Know About Us  2012                       ['One Direction']
37960                Getaway Car  2017                        ['Taylor Swift']
107672          Mind (feat. Kai)  2015  ['Jack Ü', 'Skrillex', 'Diplo', 'kai']
91872                     Hooked  2018                        ["Why Don't We"]
19037                    Thunder  2017                     ['Imagine Dragons']
19401                   Drunk Me  2018                   ['Mitchell Tenpenny']
17428                    Secrets  2009                         ['OneRepublic']
19627          Forever After All  2020                          ['Luke Combs']
91983      Different 'Round Here  2019                         ['Riley Green']
[{'name': "They Don't Know About Us",
  'year': 2012,
  'artists': "['One Direction']"},
 {'name': 'Getaway Car', 'year': 2017, 'artists': "['Taylor Swift']"},
 {'name': 'Mind (feat. Kai)',
  'year': 2015,
  'artists': "['Jack Ü', 'Skrillex', 'Diplo', 'kai']"},
 {'name': 'Hooked', 'year': 2018, 'artists': '["Why Don\'t We"]'},
 {'name': 'Thunder', 'year': 2017, 'artists': "['Imagine Dragons']"},
 {'name': 'Drunk Me', 'year': 2018, 'artists': "['Mitchell Tenpenny']"},
 {'name': 'Secrets', 'year': 2009, 'artists': "['OneRepublic']"},
 {'name': 'Forever After All', 'year': 2020, 'artists': "['Luke Combs']"},
 {'name': "Different 'Round Here", 'year': 2019, 'artists': "['Riley Green']"}]